{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4.朴素贝叶斯\n",
    "## 调包"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from math import log\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## 4.1.文本分类\n",
    "### 4.1.1.词表到向量的转换函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def loadDataSet():\n",
    "    \"\"\"\n",
    "    生成一个文本数据集和标签\n",
    "    参数：\n",
    "        无\n",
    "    返回：\n",
    "        postingList -- 文本列表\n",
    "        classVec -- 标签分类\n",
    "    \n",
    "    \"\"\"\n",
    "    postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],\n",
    "                 ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],\n",
    "                 ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],\n",
    "                 ['stop', 'posting', 'stupid', 'worthless', 'garbage'],\n",
    "                 ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],\n",
    "                 ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]\n",
    "    #0-非侮辱性，1-侮辱性\n",
    "    classVec = [0,1,0,1,0,1]    \n",
    "    return postingList,classVec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def createVocabList(dataSet):\n",
    "    \"\"\"\n",
    "    建立词汇表，也就是所有文本的并集\n",
    "    参数：\n",
    "        dataSet -- 数据集\n",
    "    返回：\n",
    "        vocabSet -- 词汇表\n",
    "    \"\"\"\n",
    "    #首先建立一个空集，新建一个集合，避免直接修改原数据\n",
    "    vocabSet = set([])\n",
    "    #遍历数据集中的每个文档\n",
    "    for document in dataSet:\n",
    "        #每个文档和之前的词汇表求并集，保证没有重复\n",
    "        vocabSet = vocabSet | set(document)\n",
    "    return list(vocabSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def setOfWords2Vec(vocabList, inputSet):\n",
    "    \"\"\"\n",
    "    把句子嵌入到词汇表中，也就是用表达一个句子\n",
    "    向量中为1，代表词汇表对应该处的词汇；反之，不对应该处词汇\n",
    "    参数：\n",
    "        vocabList -- 词汇表\n",
    "        inputSet -- 输入词汇集合\n",
    "    返回：\n",
    "        returnVec -- 返回句子对应的向量\n",
    "    \"\"\"\n",
    "    #初始化返回句子向量，长度为词汇表长度的全0向量\n",
    "    returnVec = [0]*len(vocabList)\n",
    "    #遍历输入集中的所有词汇\n",
    "    for word in inputSet:\n",
    "        #如果该词汇在词汇表中\n",
    "        if word in vocabList:\n",
    "            #对应序号元素置1\n",
    "            returnVec[vocabList.index(word)] = 1\n",
    "        #反之给出错误提示\n",
    "        else: \n",
    "            print(\"词汇“{}”不在词汇表中\".format(word))\n",
    "    return returnVec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "检验函数的执行效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "myVocabList = ['how', 'has', 'him', 'stop', 'stupid', 'garbage', 'so', 'not', 'quit', 'flea', 'posting', 'to', 'love', 'steak', 'problems', 'food', 'worthless', 'mr', 'please', 'dalmation', 'maybe', 'take', 'buying', 'dog', 'my', 'I', 'licks', 'cute', 'ate', 'park', 'help', 'is']\n"
     ]
    }
   ],
   "source": [
    "listOPosts, listClasses = loadDataSet()\n",
    "myVocabList = createVocabList(listOPosts)\n",
    "print(\"myVocabList = {}\".format(myVocabList))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "函数setOfWords2Vec()的运行效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "第0条句子对应的向量 = [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0]\n",
      "第3条句子对应的向量 = [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n"
     ]
    }
   ],
   "source": [
    "print(\"第0条句子对应的向量 = {}\".format(setOfWords2Vec(myVocabList, listOPosts[0])))\n",
    "print(\"第3条句子对应的向量 = {}\".format(setOfWords2Vec(myVocabList, listOPosts[3])))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1.2.从词向量计算概率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def trainNB0(trainMatrix,trainCategory):\n",
    "    \"\"\"\n",
    "    训练朴素贝叶斯分类器\n",
    "    参数：\n",
    "        trainMatrix -- 输入的训练矩阵\n",
    "        trainCategory -- 训练标签\n",
    "    返回：\n",
    "        p0Vect -- 非侮辱性词条概率\n",
    "        p1Vect -- 侮辱性词条概率\n",
    "        pAbusive -- 侮辱性概率\n",
    "    \"\"\"\n",
    "    #将数据转换为矩阵形式\n",
    "    trainData = np.array(trainMatrix)\n",
    "    trainLabels = np.array(trainCategory)\n",
    "    #训练文本数\n",
    "    numTrainDocs = trainData.shape[0]\n",
    "    #训练词条数\n",
    "    numWords = trainData.shape[1]\n",
    "    #侮辱性文本的概率为训练标签取均值\n",
    "    pAbusive = np.mean(trainLabels)\n",
    "    #初始化为全1\n",
    "    p0Num = np.ones((1, numWords)); p1Num = np.ones((1, numWords))\n",
    "    #分母初始化为2\n",
    "    #两项改动为了是避免连乘中一项为0，导致整体结果为0的情况\n",
    "    #Bayes分类只用比较概率的相对大小，所以这么做不会影响结果\n",
    "    p0Denom = 2.0; p1Denom = 2.0\n",
    "    #p1Num为统计每个侮辱性词条出现次数的向量，等于文本中标记1的子矩阵，沿axis = 0相加\n",
    "    p1Num += np.sum(trainData[trainLabels == 1, :], axis = 0, keepdims = True)\n",
    "    #p1Denom为统计侮辱性词条总数的标量，等于文本中标记1的子矩阵向量之和\n",
    "    p1Denom += np.sum(trainData[trainLabels == 1, :])\n",
    "    #p0同上\n",
    "    p0Num += np.sum(trainData[trainLabels == 0, :], axis = 0, keepdims = True)\n",
    "    p0Denom += np.sum(trainData[trainLabels == 0, :])\n",
    "    #对概率取对数，可以避免下溢出或者浮点数舍入误差，同样由于相对大小不变，不影响结果\n",
    "    p1Vect = np.log(p1Num/p1Denom)\n",
    "    p0Vect = np.log(p0Num/p0Denom)\n",
    "    return p0Vect,p1Vect,pAbusive"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "概率的计算采用贝叶斯公式\n",
    "$$p(c_{i}|W)=\\frac{p(W|c_{i})p(c_{i})}{p(W)}$$\n",
    "又根据朴素假设，所有词汇概率统计独立\n",
    "$$p(W|c_{i})=p(w_{0},w_{1},w_{2},...,w_{n}|c_{i})=p(w_{0}|c_{i})\\cdot p(w_{1}|c_{i})\\cdot p(w_{2}|c_{i})\\cdot ...\\cdot p(w_{n}|c_{i})$$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):\n",
    "    \"\"\"\n",
    "    计算概率，比较概率，得出结果\n",
    "    由于所有概率均取了对数，所以乘法变加法，除法变减法\n",
    "    由于只需要比较相对大小，相同的p(W)也就不用除了\n",
    "    参数：\n",
    "        vec2Classify -- 要分类的文本向量\n",
    "        p0Vec -- p0向量\n",
    "        p1Vec -- p1向量\n",
    "        pClass1 -- 类别1的概率\n",
    "    返回：\n",
    "        判断结果 -- 0或1\n",
    "    \"\"\"\n",
    "    #元素间乘法\n",
    "    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)\n",
    "    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)\n",
    "    if p1 > p0:\n",
    "        return 1\n",
    "    else: \n",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def testingNB():\n",
    "    \"\"\"\n",
    "    测试朴素贝叶斯分类器\n",
    "    参数：\n",
    "        无\n",
    "    返回：\n",
    "        无 -- 直接打印结果\n",
    "    \"\"\"\n",
    "    #读取之前的数据\n",
    "    listOPosts,listClasses = loadDataSet()\n",
    "    #产生词汇表\n",
    "    myVocabList = createVocabList(listOPosts)\n",
    "    #初始化一个训练矩阵\n",
    "    trainMat=[]\n",
    "    #遍历文档\n",
    "    for postinDoc in listOPosts:\n",
    "        #转化为向量后，添加到训练矩阵中\n",
    "        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))\n",
    "    #得到三个概率\n",
    "    p0V,p1V,pAb = trainNB0(np.array(trainMat),np.array(listClasses))\n",
    "    #待测试词组\n",
    "    testEntry = ['love', 'my', 'dalmation']\n",
    "    #词组转为矩阵\n",
    "    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))\n",
    "    #打印结果\n",
    "    print(\"{} classified as: {}\".format(testEntry, classifyNB(thisDoc,p0V,p1V,pAb)))\n",
    "    #待测试词组\n",
    "    testEntry = ['stupid', 'garbage']\n",
    "    #词组转为矩阵\n",
    "    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))\n",
    "    #打印结果\n",
    "    print(\"{} classified as: {}\".format(testEntry, classifyNB(thisDoc,p0V,p1V,pAb)))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['love', 'my', 'dalmation'] classified as: 0\n",
      "['stupid', 'garbage'] classified as: 1\n"
     ]
    }
   ],
   "source": [
    "testingNB()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4.1.3文档的词袋模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def bagOfWords2VecMN(vocabList, inputSet):\n",
    "    \"\"\"\n",
    "    文档的词袋模型\n",
    "    参数：\n",
    "        vocabList -- 词汇列表\n",
    "        inputSet -- 输入词集\n",
    "    返回：\n",
    "        returnVec -- 返回向量\n",
    "    \"\"\"\n",
    "    returnVec = [0]*len(vocabList)\n",
    "    for word in inputSet:\n",
    "        if word in vocabList:\n",
    "            #如果在词库中，对应词条就++\n",
    "            returnVec[vocabList.index(word)] += 1\n",
    "    return returnVec"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2.使用朴素贝叶斯过滤垃圾邮件\n",
    "### 4.2.1.使用朴素贝叶斯进行交叉验证"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def textParse(bigString):\n",
    "    \"\"\"\n",
    "    输入很长的字符串，转换为向量\n",
    "    参数：\n",
    "        bigString -- 长字符串\n",
    "    返回：\n",
    "        去掉少于两个字符，转换为小写的字符串\n",
    "    \"\"\"\n",
    "    import re\n",
    "    listOfTokens = re.split(r'\\W*', bigString)\n",
    "    return [tok.lower() for tok in listOfTokens if len(tok) > 2] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def spamTest():\n",
    "    \"\"\"\n",
    "    垃圾邮件测试\n",
    "    参数：\n",
    "        无\n",
    "    返回：\n",
    "        无\n",
    "    \"\"\"\n",
    "    #新建三个列表\n",
    "    docList=[]; classList = []; fullText =[]\n",
    "    #遍历垃圾邮件和正常邮件，各25个\n",
    "    for i in range(1,26):\n",
    "        #读取垃圾邮件\n",
    "        wordList = textParse(open(\"email/spam/{}.txt\".format(i), errors = 'ignore').read())\n",
    "        #添加到列表\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        #添加到类\n",
    "        classList.append(1)\n",
    "        #读取正常邮件\n",
    "        #ham中的23.txt总是报错有不能解读的字节，选择忽略该错误\n",
    "        wordList = textParse(open(\"email/ham/{}.txt\".format(i), errors = 'ignore').read())\n",
    "        docList.append(wordList)\n",
    "        fullText.extend(wordList)\n",
    "        classList.append(0)\n",
    "    #创建词汇表\n",
    "    vocabList = createVocabList(docList)\n",
    "    #训练集和测试集序号集\n",
    "    trainingSet = list(range(50)); testSet=[]\n",
    "    #随机抽取训练集中的10个序号，放入测试集\n",
    "    for i in range(10):\n",
    "        #生成随机序号\n",
    "        randIndex = np.int(np.random.uniform(0,len(trainingSet)))\n",
    "        #序号对应的元素由训练集移动到测试集中\n",
    "        testSet.append(trainingSet[randIndex])\n",
    "        del(trainingSet[randIndex]) \n",
    "    #新建训练矩阵和训练标签\n",
    "    trainMat=[]; trainClasses = []\n",
    "    #对于训练集中的元素\n",
    "    for docIndex in trainingSet:\n",
    "        #对应词袋添加到训练矩阵中\n",
    "        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))\n",
    "        #类别添加到标签中\n",
    "        trainClasses.append(classList[docIndex])\n",
    "    #训练朴素贝叶斯分类器\n",
    "    p0V,p1V,pSpam = trainNB0(np.array(trainMat),np.array(trainClasses))\n",
    "    #错误计数器初始化为0\n",
    "    errorCount = 0\n",
    "    #对于测试集\n",
    "    for docIndex in testSet:\n",
    "        #得到词袋向量\n",
    "        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])\n",
    "        #判断结果\n",
    "        if classifyNB(np.array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:\n",
    "            #统计错误\n",
    "            errorCount += 1\n",
    "            #打印错误信息\n",
    "            print(\"错误序号为：{}\".format(docList[docIndex]))\n",
    "    print(\"总准确率为：{}\".format(1 - np.float(errorCount)/len(testSet)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "错误序号为：['oem', 'adobe', 'microsoft', 'softwares', 'fast', 'order', 'and', 'download', 'microsoft', 'office', 'professional', 'plus', '2007', '2010', '129', 'microsoft', 'windows', 'ultimate', '119', 'adobe', 'photoshop', 'cs5', 'extended', 'adobe', 'acrobat', 'pro', 'extended', 'windows', 'professional', 'thousand', 'more', 'titles']\n",
      "总准确率为：0.9\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "F:\\Anaconda\\lib\\re.py:212: FutureWarning: split() requires a non-empty pattern match.\n",
      "  return _compile(pattern, flags).split(string, maxsplit)\n"
     ]
    }
   ],
   "source": [
    "spamTest()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
